DMRscaler is a method designed to identify features of differential methylation between case and control conditions across a wide range of genomic scale. In the first part of the vignette we step through downloading a publicly available dataset, running the statistical test for difference at individual CG site and a permutation test for estimating enrichment of differentially methylated CGs within a window. After this we run DMRscaler to call DMRs at each layer of window sizes generating a list of results. We then look at the data to visualize how DMRscaler has called and represents these DMR features.
Download the most recent version of DMRscaler from github and place in working directory, if DMRscaler is downloaded to another directory, change the “path_to_DMRscaler” variable below to the path to the DMRscaler directory.
<- "../DMRscaler"
path_to_DMRscaler
if(!require("devtools", quietly = TRUE )){
install.packages("devtools")
library("devtools")
}if(!require("DMRscaler")){
::install(path_to_DMRscaler)
devtoolslibrary("DMRscaler")
}
We will use data from GSE149960 from (Köhler F. et al, 2020) with DNA methylation from fibroblasts from progeria patients and controls measured on the Illumina methylation EPIC array.
if(!require("BiocManager", quietly = TRUE )){
install.packages("BiocManager")
library("BiocManager")
}if(!require("GEOquery")){
::install("GEOquery")
BiocManagerlibrary("GEOquery")
}
## get sample phenotype data table
<- getGEO("GSE149960", GSEMatrix = TRUE)
gse #> Found 1 file(s)
#> GSE149960_series_matrix.txt.gz
<- gse$GSE149960_series_matrix.txt.gz@phenoData@data
phen rm(gse)
## get methylation data as idat files (NOTE: this saves files locally in working directory,
## unpacked size is 411 Mb)
if(!dir.exists("GSE149960/idat")){
## note: some people have issues using GEOquery to download
## if this is the case, manually downloading into the data
## into the working directory may be necessary
getGEOSuppFiles("GSE149960")
untar("GSE149960/GSE149960_RAW.tar", exdir = "GSE149960/idat")
file.remove("GSE149960/GSE149960_RAW.tar")
}<- list.files("GSE149960/idat", pattern = "idat.gz$", full = TRUE)
idat_files sapply(idat_files, gunzip, overwrite = TRUE); rm(idat_files)
Preprocessing of idat files is done with minfi here.
if(!require("minfi")){
::install("minfi")
BiocManagerlibrary("minfi")
}if(!require("IlluminaHumanMethylationEPICmanifest")){
::install("IlluminaHumanMethylationEPICmanifest")
BiocManagerlibrary("IlluminaHumanMethylationEPICmanifest")
}
### Reading of idat files done with minfi library ###
<-"GSE149960/idat"
idats_dir<- read.metharray.exp(base = idats_dir)
RGSet <- preprocessFunnorm(RGSet);rm(RGSet)
GRset.funnorm #> [preprocessFunnorm] Background and dye bias correction with noob
#> Loading required package: IlluminaHumanMethylationEPICanno.ilm10b4.hg19
#> [preprocessFunnorm] Mapping to genome
#> [preprocessFunnorm] Quantile extraction
#> [preprocessFunnorm] Normalization
<- getSnpInfo(object = GRset.funnorm)
snps <- dropLociWithSnps(GRset.funnorm, snps=c("SBE", "CpG"), maf=0);rm(snps)
GRset.funnorm rm(idats_dir)
if(!require("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")){
::install("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")
BiocManagerlibrary("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")
}
<- grep("control",phen$title)
controls <- grep("hgps", phen$title)
cases <- getLocations(GRset.funnorm)
locs <- data.frame("names"=locs@ranges@NAMES, "pos"=locs@ranges@start,
locs "chr" = rep(locs@seqnames@values, locs@seqnames@lengths))
<- getBeta(GRset.funnorm) B
We use the non-parametric wilcox test for individual CG significance calculations and permutation of these significance values for estimating window level significance.
if(!require("doParallel", quietly = TRUE )){
install.packages("doParallel")
library("doParallel")
}if(!require("rlang", quietly = TRUE )){
install.packages("rlang")
library("rlang")
}if(!require("dplyr", quietly = TRUE )){
install.packages("dplyr")
library("dplyr")
}if(!require("foreach", quietly = TRUE )){
install.packages("foreach")
library("foreach")
}
registerDoParallel(detectCores())
<- DMRscaler::run_MWW(control_indices = controls ,
mwr case_indices = cases ,
Beta = B)
$chr <- as.factor(locs$chr)
locs$scoring_values <- mwr$p_val
locs$pval <- mwr$p_val
locs
## get_loc_fdr_pval permutations can take a while... consider using more liberal fdr
## threshold for estimation of pval_cutoff or manually setting pval_cutoff.
<- 0.2
fdr <- get_loc_fdr_pval(B, cases,controls, wilcox.test, fdr=fdr, return_table = T)
pcut_table #> [1] "Running 10 permutations across 12 worker nodes"
<- pcut_table$log10pval_cutoff[which(pcut_table$fdr <= fdr)]
pvals_below_fdr_cut if(length(pvals_below_fdr_cut)==0 ){
warning("desired cpg level fdr not achieved at any pvalue threshold, consider using more liberal fdr cutoff")
}<- 10^max(pcut_table$log10pval_cutoff[which(pcut_table$fdr <= fdr)])
pval_cutoff print(paste("p-value cutoff set to ", signif(pval_cutoff,2), sep=""))
#> [1] "p-value cutoff set to 0.015"
Uses significance values associated with each CG location and the values from the statistical test and permutations along with windows of adjacent measured CGs defined by the layer_sizes ToDo: This can take
<- DMRscaler::dmrscaler(locs=locs,
dmrscaler_result locs_pval_cutoff = pval_cutoff,
region_signif_method = "ben",
region_signif_cutoff = 0.05,
window_type = "k_nearest",
window_sizes = c(2,4,8,16,32,64,128),
output_type = "complete")
head(dmrscaler_result[[length(dmrscaler_result)]])
#> chr start stop pval_region_raw pval_region_adj mean_cg_signif dmr_id
#> 1 chr1 695763 861090 3.069357e-06 2.241680e-04 2.462490 1
#> 2 chr1 1758610 1846648 1.708995e-05 1.248151e-03 2.401581 2
#> 3 chr1 2008579 2040968 4.717228e-05 3.445190e-03 2.255321 3
#> 4 chr1 2125182 2191299 1.205442e-20 8.803847e-19 2.491284 4
#> 5 chr1 2407304 2436883 7.677514e-10 5.607210e-08 2.451706 5
#> 6 chr1 2724826 2948512 2.623115e-43 1.915771e-41 2.411316 6
<- data.frame(chr="chr7",start=155616381, stop=159033499,
example_region stringsAsFactors = FALSE)
if(!require("circlize", quietly = TRUE )){
install.packages("circlize")
library("circlize")
}if(!require("HilbertCurve", quietly = TRUE )){
::install("HilbertCurve")
BiocManagerlibrary("HilbertCurve")
}
= colorRamp2(c(0,-log10(0.05),-log10(0.01),max(-log10(locs$pval))), c("grey30", "grey60", "red", "red"))
col_fun = 6
level ## 4^level - 1 = # segments
<- locs[which(locs$chr== example_region$chr ),]
hc_points <- nrow(hc_points)
hc_max <- col_fun(-log10(hc_points$pval))
hc_col <- -log10(hc_points$pval) / max(-log10(hc_points$pval))
hc_size
# hc_size <- hc_points$scoring_values / fdrscaler
<- HilbertCurve(s=1,e=hc_max, level = level, reference = F, title = example_region$chr)
hc hc_points(hc, x1 = 1:nrow(hc_points), np = NULL, pch=15, size = unit(hc_size*2, "mm"),
gp = gpar(col = hc_col, fill = hc_col))
hc_polygon(hc, x1 = min(which(hc_points$pos >= example_region$start)),
x2 = max(which(hc_points$pos <= example_region$stop)) )
### now looking only at the specified region
= 4
level <- locs[which(locs$chr== example_region$chr &
hc_points $pos >= example_region$start &
locs$pos <= example_region$stop ),]
locs<- nrow(hc_points)
hc_max <- col_fun(-log10(hc_points$pval))
hc_col <- -log10(hc_points$pval) / max(-log10(hc_points$pval))
hc_size
<- HilbertCurve(s=1,e=hc_max, level = level, reference = F, title = example_region$chr)
hc hc_points(hc, x1 = 1:nrow(hc_points), np = NULL, pch=15, size = unit(hc_size*4, "mm"),
gp = gpar(col = hc_col, fill = hc_col))
DMRscaler defines differential methylation features iteratively, expanding the size of the window for aggregating on at each step this procedure allows a hierarchical structure to describe a region enriched in differntially methylated CpGs. We look at our example region here
if(!require("networkD3", quietly = TRUE )){
install.packages("networkD3")
library("networkD3")
}
<- example_generate_dmr_tree(dmrscaler_result = dmrscaler_result,
dmr_tree layer=length(dmrscaler_result),
chr=example_region$chr,
start = example_region$start,
stop = example_region$stop)
diagonalNetwork(List = dmr_tree, fontSize = 12, fontFamily = "bold",
nodeStroke = "black", linkColour = "black", opacity = 1 )
Next we can look at the region using the Gviz package
if(!require("Gviz")){
::install("Gviz")
BiocManagerlibrary("Gviz")
}#> Loading required package: Gviz
if(!require("biomaRt")){
::install("biomaRt")
BiocManagerlibrary("biomaRt")
}#> Loading required package: biomaRt
## organize data used in tracks
<- character(length=ncol(B))
group <- "case"
group[cases] <- "control"
group[controls] <- rowMeans(B[,cases]) - rowMeans(B[,controls])
delta_mean_B <- which(locs$chr==example_region$chr & locs$pos >= example_region$start & locs$pos <= example_region$stop)
which <- B[which, ]
Bsub <- GRanges(seqnames = example_region$chr, ranges = IRanges(start = locs[which,"pos"], width = 1), mcols = Bsub )
gr
## set up ideogram track
<-IdeogramTrack(genome="hg19", chromosome = example_region$chr)
itrack## set up genome axis track
<-GenomeAxisTrack()
gtrack## set up significance track
<- DataTrack(range=gr, data = -log10(locs$pval[which]), type=c("p"))
sig_track ## set up beta value track
<- DataTrack(range = gr, data = t(Bsub), groups=group,name="Beta", type=c("a","g","confint"))
beta_track ## set up diff beta track
<- DataTrack(range = gr, data = delta_mean_B[which], type=c("a","g"))
diff_beta_track
## set up gene model track ###
<- example_gene_anno_df(chr=example_region$chr, start=example_region$start, stop=example_region$stop)
genesub if(!all(is.na(genesub))){
<- GeneRegionTrack(genesub, chromosome = example_region$chr, start = example_region$start, stop=example_region$stop, transcriptAnnotation="symbol" ,collapseTranscripts = "meta")
grtrack else{grtrack<-GeneRegionTrack()}
}
## set up DMRscaler results layer tracks
<- list()
layer_track for(j in 1:length(dmrscaler_result)){
if(nrow(dmrscaler_result[[j]])==0){
<- AnnotationTrack()
layer_track[[j]] else {
} <- AnnotationTrack(start = dmrscaler_result[[j]]$start,
layer_track[[j]] end = dmrscaler_result[[j]]$stop,
chromosome = dmrscaler_result[[j]]$chr,
strand = "*", genome = "hg19", name=paste("layer", j, sep = "_"))
displayPars(layer_track[[j]]) <- list(cex.legend=0.7, cex.axis=0.7, cex.title=0.7, rotation.title=0, stackHeight = 1, shape="box")
}
}
plotTracks(c(list(itrack, gtrack, sig_track, beta_track, diff_beta_track, grtrack), layer_track), from = example_region$start-1, to=example_region$stop+1 )
sessionInfo()
#> R version 4.2.1 (2022-06-23)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] grid parallel stats4 stats graphics grDevices utils
#> [8] datasets methods base
#>
#> other attached packages:
#> [1] biomaRt_2.52.0
#> [2] Gviz_1.40.1
#> [3] networkD3_0.4
#> [4] HilbertCurve_1.26.0
#> [5] circlize_0.4.15
#> [6] dplyr_1.0.9
#> [7] rlang_1.0.3
#> [8] doParallel_1.0.17
#> [9] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0
#> [10] IlluminaHumanMethylationEPICmanifest_0.3.0
#> [11] minfi_1.42.0
#> [12] bumphunter_1.38.0
#> [13] locfit_1.5-9.5
#> [14] iterators_1.0.14
#> [15] foreach_1.5.2
#> [16] Biostrings_2.64.0
#> [17] XVector_0.36.0
#> [18] SummarizedExperiment_1.26.1
#> [19] MatrixGenerics_1.8.1
#> [20] matrixStats_0.62.0
#> [21] GenomicRanges_1.48.0
#> [22] GenomeInfoDb_1.32.2
#> [23] IRanges_2.30.0
#> [24] S4Vectors_0.34.0
#> [25] GEOquery_2.64.2
#> [26] Biobase_2.56.0
#> [27] BiocGenerics_0.42.0
#> [28] BiocManager_1.30.18
#> [29] DMRscaler_0.0.0.9001
#> [30] devtools_2.4.3
#> [31] usethis_2.1.6
#>
#> loaded via a namespace (and not attached):
#> [1] utf8_1.2.2 R.utils_2.12.0
#> [3] tidyselect_1.1.2 RSQLite_2.2.14
#> [5] AnnotationDbi_1.58.0 htmlwidgets_1.5.4
#> [7] BiocParallel_1.30.3 munsell_0.5.0
#> [9] codetools_0.2-18 preprocessCore_1.58.0
#> [11] interp_1.1-2 colorspace_2.0-3
#> [13] filelock_1.0.2 highr_0.9
#> [15] knitr_1.39 rstudioapi_0.13
#> [17] GenomeInfoDbData_1.2.8 bit64_4.0.5
#> [19] rhdf5_2.40.0 vctrs_0.4.1
#> [21] generics_0.1.3 xfun_0.31
#> [23] biovizBase_1.44.0 BiocFileCache_2.4.0
#> [25] R6_2.5.1 illuminaio_0.38.0
#> [27] AnnotationFilter_1.20.0 bitops_1.0-7
#> [29] rhdf5filters_1.8.0 cachem_1.0.6
#> [31] reshape_0.8.9 DelayedArray_0.22.0
#> [33] assertthat_0.2.1 BiocIO_1.6.0
#> [35] scales_1.2.0 nnet_7.3-17
#> [37] gtable_0.3.0 processx_3.6.1
#> [39] ensembldb_2.20.2 genefilter_1.78.0
#> [41] GlobalOptions_0.1.2 splines_4.2.1
#> [43] lazyeval_0.2.2 rtracklayer_1.56.1
#> [45] dichromat_2.0-0.1 checkmate_2.1.0
#> [47] yaml_2.3.5 backports_1.4.1
#> [49] GenomicFeatures_1.48.3 Hmisc_4.7-0
#> [51] tools_4.2.1 nor1mix_1.3-0
#> [53] ggplot2_3.3.6 ellipsis_0.3.2
#> [55] jquerylib_0.1.4 RColorBrewer_1.1-3
#> [57] siggenes_1.70.0 sessioninfo_1.2.2
#> [59] Rcpp_1.0.8.3 plyr_1.8.7
#> [61] base64enc_0.1-3 sparseMatrixStats_1.8.0
#> [63] progress_1.2.2 zlibbioc_1.42.0
#> [65] purrr_0.3.4 RCurl_1.98-1.7
#> [67] ps_1.7.1 prettyunits_1.1.1
#> [69] rpart_4.1.16 deldir_1.0-6
#> [71] openssl_2.0.2 cluster_2.1.3
#> [73] fs_1.5.2 magrittr_2.0.3
#> [75] data.table_1.14.2 ProtGenerics_1.28.0
#> [77] pkgload_1.3.0 hms_1.1.1
#> [79] evaluate_0.15 xtable_1.8-4
#> [81] XML_3.99-0.10 jpeg_0.1-9
#> [83] mclust_5.4.10 gridExtra_2.3
#> [85] shape_1.4.6 compiler_4.2.1
#> [87] tibble_3.1.7 crayon_1.5.1
#> [89] R.oo_1.25.0 htmltools_0.5.2
#> [91] tzdb_0.3.0 Formula_1.2-4
#> [93] tidyr_1.2.0 DBI_1.1.3
#> [95] dbplyr_2.2.1 MASS_7.3-57
#> [97] rappdirs_0.3.3 Matrix_1.4-1
#> [99] readr_2.1.2 cli_3.3.0
#> [101] quadprog_1.5-8 R.methodsS3_1.8.2
#> [103] igraph_1.3.2 pkgconfig_2.0.3
#> [105] GenomicAlignments_1.32.0 foreign_0.8-82
#> [107] xml2_1.3.3 annotate_1.74.0
#> [109] bslib_0.3.1 rngtools_1.5.2
#> [111] multtest_2.52.0 beanplot_1.3.1
#> [113] doRNG_1.8.2 scrime_1.3.5
#> [115] VariantAnnotation_1.42.1 stringr_1.4.0
#> [117] callr_3.7.0 digest_0.6.29
#> [119] rmarkdown_2.14 base64_2.0
#> [121] polylabelr_0.2.0 htmlTable_2.4.0
#> [123] HilbertVis_1.54.0 DelayedMatrixStats_1.18.0
#> [125] restfulr_0.0.15 curl_4.3.2
#> [127] Rsamtools_2.12.0 rjson_0.2.21
#> [129] lifecycle_1.0.1 nlme_3.1-157
#> [131] jsonlite_1.8.0 Rhdf5lib_1.18.2
#> [133] askpass_1.1 limma_3.52.2
#> [135] BSgenome_1.64.0 fansi_1.0.3
#> [137] pillar_1.7.0 lattice_0.20-45
#> [139] KEGGREST_1.36.2 fastmap_1.1.0
#> [141] httr_1.4.3 pkgbuild_1.3.1
#> [143] survival_3.2-13 glue_1.6.2
#> [145] remotes_2.4.2 png_0.1-7
#> [147] bit_4.0.4 stringi_1.7.6
#> [149] sass_0.4.1 HDF5Array_1.24.1
#> [151] blob_1.2.3 latticeExtra_0.6-30
#> [153] memoise_2.0.1